In [ ]:
import os
os.chdir("../data/raw")
os.getcwd()
Out[ ]:
'/root/restaurants/data/raw'
In [ ]:
import zipfile
with zipfile.ZipFile("dataset.zip", 'r') as zip_ref:
zip_ref.extractall("")
with zipfile.ZipFile("air_reserve.csv.zip", 'r') as zip_ref:
zip_ref.extractall("")
with zipfile.ZipFile("hpg_reserve.csv.zip", 'r') as zip_ref:
zip_ref.extractall("")
with zipfile.ZipFile("air_visit_data.csv.zip", 'r') as zip_ref:
zip_ref.extractall("")
with zipfile.ZipFile("air_store_info.csv.zip", 'r') as zip_ref:
zip_ref.extractall("")
with zipfile.ZipFile("hpg_store_info.csv.zip", 'r') as zip_ref:
zip_ref.extractall("")
with zipfile.ZipFile("date_info.csv.zip", 'r') as zip_ref:
zip_ref.extractall("")
with zipfile.ZipFile("store_id_relation.csv.zip", 'r') as zip_ref:
zip_ref.extractall("")
In [ ]:
import pandas as pd
import numpy as np
from ydata_profiling import ProfileReport
import seaborn as sns
import matplotlib.pyplot as plt
from copy import deepcopy
import random
import plotly.graph_objects as go
import plotly.express as px
air_reserve¶
In [ ]:
air_reserve = pd.read_csv("air_reserve.csv")
profile = ProfileReport(air_reserve, title="Profiling Report")
profile
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]
Out[ ]:
In [ ]:
air_reserve.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 92378 entries, 0 to 92377 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 air_store_id 92378 non-null object 1 visit_datetime 92378 non-null object 2 reserve_datetime 92378 non-null object 3 reserve_visitors 92378 non-null int64 dtypes: int64(1), object(3) memory usage: 2.8+ MB
In [ ]:
air_reserve['visit_datetime'] = pd.to_datetime(air_reserve['visit_datetime'])
air_reserve['reserve_datetime'] = pd.to_datetime(air_reserve['reserve_datetime'])
hpg_reserve¶
In [ ]:
hpg_reserve = pd.read_csv("hpg_reserve.csv")
profile = ProfileReport(hpg_reserve, title="Profiling Report")
profile
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]
Out[ ]:
In [ ]:
hpg_reserve.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2000320 entries, 0 to 2000319 Data columns (total 4 columns): # Column Dtype --- ------ ----- 0 hpg_store_id object 1 visit_datetime object 2 reserve_datetime object 3 reserve_visitors int64 dtypes: int64(1), object(3) memory usage: 61.0+ MB
In [ ]:
hpg_reserve['visit_datetime'] = pd.to_datetime(hpg_reserve['visit_datetime'])
hpg_reserve['reserve_datetime'] = pd.to_datetime(hpg_reserve['reserve_datetime'])
In [ ]:
hpg_reserve = hpg_reserve.rename(columns={"hpg_store_id": "store_id"})
air_reserve = air_reserve.rename(columns={"air_store_id": "store_id"})
reserve_data = pd.concat([air_reserve, hpg_reserve])
reserve_data
Out[ ]:
| store_id | visit_datetime | reserve_datetime | reserve_visitors | |
|---|---|---|---|---|
| 0 | air_877f79706adbfb06 | 2016-01-01 19:00:00 | 2016-01-01 16:00:00 | 1 |
| 1 | air_db4b38ebe7a7ceff | 2016-01-01 19:00:00 | 2016-01-01 19:00:00 | 3 |
| 2 | air_db4b38ebe7a7ceff | 2016-01-01 19:00:00 | 2016-01-01 19:00:00 | 6 |
| 3 | air_877f79706adbfb06 | 2016-01-01 20:00:00 | 2016-01-01 16:00:00 | 2 |
| 4 | air_db80363d35f10926 | 2016-01-01 20:00:00 | 2016-01-01 01:00:00 | 5 |
| ... | ... | ... | ... | ... |
| 2000315 | hpg_2b293477ea4642fb | 2017-05-31 20:00:00 | 2017-04-19 07:00:00 | 2 |
| 2000316 | hpg_f922aeca48ca5a29 | 2017-05-31 20:00:00 | 2017-03-19 00:00:00 | 3 |
| 2000317 | hpg_e9151de687b93aa3 | 2017-05-31 21:00:00 | 2017-04-14 20:00:00 | 49 |
| 2000318 | hpg_fae6c96189b4a122 | 2017-05-31 21:00:00 | 2017-04-19 17:00:00 | 9 |
| 2000319 | hpg_0b70de808b55ad1e | 2017-05-31 23:00:00 | 2017-04-21 13:00:00 | 6 |
2092698 rows × 4 columns
visit data¶
In [ ]:
hpg_visit_data = hpg_reserve.drop(columns = ['reserve_datetime']).groupby(by = ["store_id", "visit_datetime"]).sum().reset_index()
hpg_visit_data = hpg_visit_data.rename(columns={"visit_datetime": "visit_date", "reserve_visitors": "visitors"})
profile = ProfileReport(hpg_visit_data, title="Profiling Report")
profile
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]
Out[ ]:
In [ ]:
hpg_visit_data['visit_date'] = pd.to_datetime(hpg_visit_data['visit_date'])
hpg_visit_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1711380 entries, 0 to 1711379 Data columns (total 3 columns): # Column Dtype --- ------ ----- 0 store_id object 1 visit_date datetime64[ns] 2 visitors int64 dtypes: datetime64[ns](1), int64(1), object(1) memory usage: 39.2+ MB
In [ ]:
air_visit_data = pd.read_csv("air_visit_data.csv")
profile = ProfileReport(air_visit_data, title="Profiling Report")
profile
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]
Out[ ]:
In [ ]:
air_visit_data['visit_date'] = pd.to_datetime(air_visit_data['visit_date'])
air_visit_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 252108 entries, 0 to 252107 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 air_store_id 252108 non-null object 1 visit_date 252108 non-null datetime64[ns] 2 visitors 252108 non-null int64 dtypes: datetime64[ns](1), int64(1), object(1) memory usage: 5.8+ MB
In [ ]:
hpg_visit_data = hpg_visit_data.rename(columns={"hpg_store_id": "store_id"})
air_visit_data = air_visit_data.rename(columns={"air_store_id": "store_id"})
visit_data = pd.concat([air_visit_data, hpg_visit_data])
visit_data['visit_date'] = visit_data['visit_date'].dt.date
visit_data
Out[ ]:
| store_id | visit_date | visitors | |
|---|---|---|---|
| 0 | air_ba937bf13d40fb24 | 2016-01-13 | 25 |
| 1 | air_ba937bf13d40fb24 | 2016-01-14 | 32 |
| 2 | air_ba937bf13d40fb24 | 2016-01-15 | 29 |
| 3 | air_ba937bf13d40fb24 | 2016-01-16 | 22 |
| 4 | air_ba937bf13d40fb24 | 2016-01-18 | 6 |
| ... | ... | ... | ... |
| 1711375 | hpg_fffc097dce87af3e | 2017-03-10 | 2 |
| 1711376 | hpg_fffc097dce87af3e | 2017-03-17 | 2 |
| 1711377 | hpg_fffc097dce87af3e | 2017-03-27 | 5 |
| 1711378 | hpg_fffc097dce87af3e | 2017-04-21 | 3 |
| 1711379 | hpg_fffc097dce87af3e | 2017-04-23 | 8 |
1963488 rows × 3 columns
air_store_info¶
In [ ]:
air_store_info = pd.read_csv("air_store_info.csv")
profile = ProfileReport(air_store_info, title="Profiling Report")
profile
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]
Out[ ]:
hpg_store_info¶
In [ ]:
hpg_store_info = pd.read_csv("hpg_store_info.csv")
profile = ProfileReport(hpg_store_info, title="Profiling Report")
profile
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]
Out[ ]:
In [ ]:
hpg_store_info = hpg_store_info.rename(columns={"hpg_store_id": "store_id", "hpg_genre_name": "genre_name", "hpg_area_name": "area_name"})
air_store_info = air_store_info.rename(columns={"air_store_id": "store_id", "air_genre_name": "genre_name", "air_area_name": "area_name"})
In [ ]:
store_info = pd.concat([air_store_info, hpg_store_info])
store_info
Out[ ]:
| store_id | genre_name | area_name | latitude | longitude | |
|---|---|---|---|---|---|
| 0 | air_0f0cdeee6c9bf3d7 | Italian/French | Hyōgo-ken Kōbe-shi Kumoidōri | 34.695124 | 135.197853 |
| 1 | air_7cc17a324ae5c7dc | Italian/French | Hyōgo-ken Kōbe-shi Kumoidōri | 34.695124 | 135.197853 |
| 2 | air_fee8dcf4d619598e | Italian/French | Hyōgo-ken Kōbe-shi Kumoidōri | 34.695124 | 135.197853 |
| 3 | air_a17f0778617c76e2 | Italian/French | Hyōgo-ken Kōbe-shi Kumoidōri | 34.695124 | 135.197853 |
| 4 | air_83db5aff8f50478e | Italian/French | Tōkyō-to Minato-ku Shibakōen | 35.658068 | 139.751599 |
| ... | ... | ... | ... | ... | ... |
| 4685 | hpg_c6ed353a48c7c93e | Sichuan food | Tōkyō-to Chiyoda-ku None | 35.695780 | 139.768453 |
| 4686 | hpg_a58d2423360b2fb4 | Sichuan food | Tōkyō-to Minato-ku Roppongi | 35.666137 | 139.733978 |
| 4687 | hpg_d2021704076cdd70 | Shanghai food | Tōkyō-to Chiyoda-ku None | 35.695780 | 139.768453 |
| 4688 | hpg_602472570d65e219 | Spain/Mediterranean cuisine | Tōkyō-to Toshima-ku None | 35.730068 | 139.711742 |
| 4689 | hpg_11801306ce8bfb4f | Udon/Soba | Osaka Prefecture Osaka None | 34.701519 | 135.498859 |
5519 rows × 5 columns
date_info¶
In [ ]:
date_info = pd.read_csv("date_info.csv")
date_info
Out[ ]:
| calendar_date | day_of_week | holiday_flg | |
|---|---|---|---|
| 0 | 2016-01-01 | Friday | 1 |
| 1 | 2016-01-02 | Saturday | 1 |
| 2 | 2016-01-03 | Sunday | 1 |
| 3 | 2016-01-04 | Monday | 0 |
| 4 | 2016-01-05 | Tuesday | 0 |
| ... | ... | ... | ... |
| 512 | 2017-05-27 | Saturday | 0 |
| 513 | 2017-05-28 | Sunday | 0 |
| 514 | 2017-05-29 | Monday | 0 |
| 515 | 2017-05-30 | Tuesday | 0 |
| 516 | 2017-05-31 | Wednesday | 0 |
517 rows × 3 columns
In [ ]:
date_info.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 517 entries, 0 to 516 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 calendar_date 517 non-null object 1 day_of_week 517 non-null object 2 holiday_flg 517 non-null int64 dtypes: int64(1), object(2) memory usage: 12.2+ KB
In [ ]:
visit_data
Out[ ]:
| store_id | visit_date | visitors | |
|---|---|---|---|
| 0 | air_ba937bf13d40fb24 | 2016-01-13 | 25 |
| 1 | air_ba937bf13d40fb24 | 2016-01-14 | 32 |
| 2 | air_ba937bf13d40fb24 | 2016-01-15 | 29 |
| 3 | air_ba937bf13d40fb24 | 2016-01-16 | 22 |
| 4 | air_ba937bf13d40fb24 | 2016-01-18 | 6 |
| ... | ... | ... | ... |
| 1711375 | hpg_fffc097dce87af3e | 2017-03-10 | 2 |
| 1711376 | hpg_fffc097dce87af3e | 2017-03-17 | 2 |
| 1711377 | hpg_fffc097dce87af3e | 2017-03-27 | 5 |
| 1711378 | hpg_fffc097dce87af3e | 2017-04-21 | 3 |
| 1711379 | hpg_fffc097dce87af3e | 2017-04-23 | 8 |
1963488 rows × 3 columns
In [ ]:
visit_data
Out[ ]:
| store_id | visit_date | visitors | |
|---|---|---|---|
| 0 | air_ba937bf13d40fb24 | 2016-01-13 | 25 |
| 1 | air_ba937bf13d40fb24 | 2016-01-14 | 32 |
| 2 | air_ba937bf13d40fb24 | 2016-01-15 | 29 |
| 3 | air_ba937bf13d40fb24 | 2016-01-16 | 22 |
| 4 | air_ba937bf13d40fb24 | 2016-01-18 | 6 |
| ... | ... | ... | ... |
| 1711375 | hpg_fffc097dce87af3e | 2017-03-10 | 2 |
| 1711376 | hpg_fffc097dce87af3e | 2017-03-17 | 2 |
| 1711377 | hpg_fffc097dce87af3e | 2017-03-27 | 5 |
| 1711378 | hpg_fffc097dce87af3e | 2017-04-21 | 3 |
| 1711379 | hpg_fffc097dce87af3e | 2017-04-23 | 8 |
1963488 rows × 3 columns
In [ ]:
visit_data = visit_data.rename(columns={"visit_date": "date"})
visit_data['date'] = visit_data['date'].astype("string")
date_info = date_info.rename(columns={"calendar_date": "date"})
date_info['date'] = date_info['date'].astype("string")
In [ ]:
data = pd.merge(visit_data, date_info, on="date")
data = pd.merge(data, store_info, on="store_id")
data['date'] = pd.to_datetime(data['date'])
profile = ProfileReport(data, title="Profiling Report")
profile
data
Out[ ]:
| store_id | date | visitors | day_of_week | holiday_flg | genre_name | area_name | latitude | longitude | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | air_ba937bf13d40fb24 | 2016-01-13 | 25 | Wednesday | 0 | Dining bar | Tōkyō-to Minato-ku Shibakōen | 35.658068 | 139.751599 |
| 1 | air_ba937bf13d40fb24 | 2016-01-14 | 32 | Thursday | 0 | Dining bar | Tōkyō-to Minato-ku Shibakōen | 35.658068 | 139.751599 |
| 2 | air_ba937bf13d40fb24 | 2016-01-15 | 29 | Friday | 0 | Dining bar | Tōkyō-to Minato-ku Shibakōen | 35.658068 | 139.751599 |
| 3 | air_ba937bf13d40fb24 | 2016-01-16 | 22 | Saturday | 0 | Dining bar | Tōkyō-to Minato-ku Shibakōen | 35.658068 | 139.751599 |
| 4 | air_ba937bf13d40fb24 | 2016-01-18 | 6 | Monday | 0 | Dining bar | Tōkyō-to Minato-ku Shibakōen | 35.658068 | 139.751599 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 982086 | hpg_ffe96ca8c6ec10cf | 2017-03-31 | 8 | Friday | 0 | Italian | Tōkyō-to Chūō-ku None | 35.682504 | 139.773619 |
| 982087 | hpg_ffe96ca8c6ec10cf | 2017-04-05 | 4 | Wednesday | 0 | Italian | Tōkyō-to Chūō-ku None | 35.682504 | 139.773619 |
| 982088 | hpg_ffe96ca8c6ec10cf | 2017-04-07 | 11 | Friday | 0 | Italian | Tōkyō-to Chūō-ku None | 35.682504 | 139.773619 |
| 982089 | hpg_ffe96ca8c6ec10cf | 2017-04-15 | 3 | Saturday | 0 | Italian | Tōkyō-to Chūō-ku None | 35.682504 | 139.773619 |
| 982090 | hpg_ffe96ca8c6ec10cf | 2017-04-24 | 16 | Monday | 0 | Italian | Tōkyō-to Chūō-ku None | 35.682504 | 139.773619 |
982091 rows × 9 columns
In [ ]:
temp = deepcopy(data)
temp = temp.groupby(by=['holiday_flg']).count().reset_index().drop(columns=["date", "visitors", "day_of_week", "genre_name", "area_name", "latitude", "longitude"])
temp = temp.rename(columns={"store_id": "count"})
fig = px.bar(temp, x="holiday_flg", y="count",
barmode='group',
height=400)
fig.show()
In [ ]:
temp = deepcopy(data)
temp = temp.drop(columns=["store_id", "date", "holiday_flg", "genre_name", "area_name", "latitude", "longitude"])
temp = temp.groupby('day_of_week').sum().reset_index().sort_values("visitors")
fig = px.bar(temp, x='day_of_week', y='visitors',
color='visitors',
labels={'pop':'population of Canada'}, height=400)
fig.show()
In [ ]:
store_id_relation = pd.read_csv("store_id_relation.csv")
store_id_relation
Out[ ]:
| air_store_id | hpg_store_id | |
|---|---|---|
| 0 | air_63b13c56b7201bd9 | hpg_4bc649e72e2a239a |
| 1 | air_a24bf50c3e90d583 | hpg_c34b496d0305a809 |
| 2 | air_c7f78b4f3cba33ff | hpg_cd8ae0d9bbd58ff9 |
| 3 | air_947eb2cae4f3e8f2 | hpg_de24ea49dc25d6b8 |
| 4 | air_965b2e0cf4119003 | hpg_653238a84804d8e7 |
| ... | ... | ... |
| 145 | air_b8925441167c3152 | hpg_0cc0abae3a8b52de |
| 146 | air_f8233ad00755c35c | hpg_098e4dd30e54fee6 |
| 147 | air_6607fe3671242ce3 | hpg_1004356dc8ab69ec |
| 148 | air_ccd19a5bc5573ae5 | hpg_df22bf146d2c7e19 |
| 149 | air_465bddfed3353b23 | hpg_23c5945dc1a72454 |
150 rows × 2 columns
In [ ]:
air_store_info = air_store_info.rename(columns={"store_id": "air_store_id", "genre_name": "air_genre_name", "area_name": "air_area_name", "latitude": "air_latitude", "longitude": "air_longitude"})
hpg_store_info = hpg_store_info.rename(columns={"store_id": "hpg_store_id", "genre_name": "hpg_genre_name", "area_name": "hpg_area_name", "latitude": "hpg_latitude", "longitude": "hpg_longitude"})
store_relation = pd.merge(store_id_relation, air_store_info, on="air_store_id", how="left")
store_relation = pd.merge(store_relation, hpg_store_info, on="hpg_store_id", how="left")
store_relation.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 150 entries, 0 to 149 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 air_store_id 150 non-null object 1 hpg_store_id 150 non-null object 2 air_genre_name 150 non-null object 3 air_area_name 150 non-null object 4 air_latitude 150 non-null float64 5 air_longitude 150 non-null float64 6 hpg_genre_name 63 non-null object 7 hpg_area_name 63 non-null object 8 hpg_latitude 63 non-null float64 9 hpg_longitude 63 non-null float64 dtypes: float64(4), object(6) memory usage: 11.8+ KB
In [ ]:
sns.heatmap(store_relation.isna(), yticklabels=False, cbar=False)
Out[ ]:
<Axes: >
In [ ]:
store_relation = pd.read_csv("store_id_relation.csv")
store_relation = pd.merge(store_relation, air_store_info, on="air_store_id", how="left")
store_relation = pd.merge(store_relation, hpg_store_info, on="hpg_store_id")
air_store_info = air_store_info.rename(columns={"air_store_id": "store_id", "air_genre_name": "genre_name", "air_area_name": "area_name", "air_latitude": "latitude", "air_longitude": "longitude"})
hpg_store_info = hpg_store_info.rename(columns={"hpg_store_id": "store_id", "hpg_genre_name": "genre_name", "hpg_area_name": "area_name", "hpg_latitude": "latitude", "hpg_longitude": "longitude"})
In [ ]:
import geopy.distance
from geopy import Point
store_relation["air_point"] = store_relation.apply(lambda row: Point(latitude=row['air_latitude'], longitude=row['air_longitude']), axis=1)
store_relation["hpg_point"] = store_relation.apply(lambda row: Point(latitude=row['hpg_latitude'], longitude=row['hpg_longitude']), axis=1)
store_relation["distance"] = store_relation["air_latitude"]
for i in range(store_relation.shape[0]):
store_relation.loc[i, "distance"] = geopy.distance.distance(store_relation.loc[i, "air_point"], store_relation.loc[i, "hpg_point"]).km
store_relation = store_relation.sort_values("distance").reset_index(drop=True)
store_relation.tail()
Out[ ]:
| air_store_id | hpg_store_id | air_genre_name | air_area_name | air_latitude | air_longitude | hpg_genre_name | hpg_area_name | hpg_latitude | hpg_longitude | air_point | hpg_point | distance | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 58 | air_d00161e19f08290b | hpg_05fada27f04e4383 | Dining bar | Tōkyō-to Shinjuku-ku Kabukichō | 35.693840 | 139.703549 | International cuisine | None None None | 35.688589 | 139.731971 | 35 41m 37.8244s N, 139 42m 12.7778s E | 35 41m 18.9216s N, 139 43m 55.0955s E | 2.637691 |
| 59 | air_c1ff20617c54fee7 | hpg_4be4a5cb851e45af | Izakaya | Tōkyō-to Meguro-ku Kamimeguro | 35.641463 | 139.698171 | International cuisine | Tōkyō-to Minato-ku Takanawa | 35.637183 | 139.737998 | 35 38m 29.2664s N, 139 41m 53.4163s E | 35 38m 13.8586s N, 139 44m 16.7929s E | 3.638336 |
| 60 | air_258ad2619d7bff9a | hpg_e87d00be48cd91c6 | Izakaya | Tōkyō-to Kōtō-ku Tōyō | 35.672854 | 139.817410 | Japanese style | Tōkyō-to Sumida-ku Tachibana | 35.704960 | 139.828642 | 35 40m 22.2726s N, 139 49m 2.67492s E | 35 42m 17.8562s N, 139 49m 43.1123s E | 3.704582 |
| 61 | air_48f4da6223571da4 | hpg_832ba309e6699258 | Italian/French | Tōkyō-to Tachikawa-shi Izumichō | 35.714014 | 139.407843 | Italian | Tōkyō-to Hachiōji-shi Ishikawamachi | 35.677207 | 139.373870 | 35 42m 50.4511s N, 139 24m 28.2352s E | 35 40m 37.9442s N, 139 22m 25.9306s E | 5.112082 |
| 62 | air_640cf4835f0d9ba3 | hpg_e63863c8987bd064 | Izakaya | Tōkyō-to Ōta-ku Kamata | 35.561257 | 139.716051 | Japanese style | Kanagawa-ken Yokohama-shi Nagatsutachō | 35.512762 | 139.495733 | 35 33m 40.5245s N, 139 42m 57.784s E | 35 30m 45.9439s N, 139 29m 44.6382s E | 20.691947 |
In [ ]:
import plotly.express as px
fig = px.histogram(store_relation, x='distance')
fig.show()
In [ ]:
def get_delta(df):
temp = deepcopy(df)
temp["delta"] = temp["visit_datetime"] - temp["reserve_datetime"]
temp = temp.rename(columns={"visit_datetime": "date"})
temp['date'] = temp['date'].dt.date
temp = temp.drop(columns=["reserve_datetime", "store_id"])
def delta_days(x):
if x.days < 1:
return "<1"
elif x.days <= 7:
return "1 - 7"
else:
return ">7"
temp["delta_days"] = temp["delta"].apply(delta_days)
temp = temp.groupby(by=["date", "delta_days"]).reserve_visitors.agg(["sum", "count"]).reset_index().sort_values("date")
fig = px.line(temp, x="date", y="count", color='delta_days')
fig.show()
fig = px.line(temp, x="date", y="sum", color='delta_days')
fig.show()
In [ ]:
get_delta(air_reserve)
In [ ]:
get_delta(reserve_data)
In [ ]:
import plotly.express as px
temp = deepcopy(visit_data)
temp["store_id"] = temp["store_id"].str[:3]
temp = temp.groupby(by=["date", "store_id"]).sum().reset_index()
fig = px.line(temp, x="date", y="visitors", color='store_id')
fig.show()
In [ ]:
import plotly.express as px
temp = deepcopy(data)
temp["store_id"] = temp["store_id"].str[:3]
temp1 = deepcopy(temp)
temp1["store_id"] = "all"
temp = pd.concat([temp, temp1])
temp = temp.groupby(by=["date", "store_id"]).sum().reset_index()
fig = px.line(temp, x="date", y="visitors", color='store_id')
fig.show()
In [ ]:
len(visit_data['store_id'].unique())
Out[ ]:
14154
In [ ]:
len(data['store_id'].unique())
Out[ ]:
5519
In [ ]:
temp = deepcopy(reserve_data)
temp["delta"] = temp["visit_datetime"] - temp["reserve_datetime"]
temp = temp.rename(columns={"visit_datetime": "date"})
temp['date'] = temp['date'].dt.date
temp = temp.drop(columns=["reserve_datetime"])
temp = temp.groupby(by=["store_id", "date"]).sum().reset_index()
temp["date"] = pd.to_datetime(temp["date"])
data = pd.merge(data, temp, on=["store_id", "date"])
In [ ]:
data
Out[ ]:
| store_id | date | visitors | day_of_week | holiday_flg | genre_name | area_name | latitude | longitude | reserve_visitors | delta | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | air_35512c42db0868da | 2016-02-27 | 6 | Saturday | 0 | Dining bar | Tōkyō-to Musashino-shi Midorichō | 35.717784 | 139.566260 | 12 | 0 days 01:00:00 |
| 1 | air_ee3a01f0c71a769f | 2016-01-04 | 61 | Monday | 0 | Cafe/Sweets | Shizuoka-ken Hamamatsu-shi Motoshirochō | 34.710895 | 137.725940 | 2 | 0 days 04:00:00 |
| 2 | air_ee3a01f0c71a769f | 2016-01-08 | 21 | Friday | 0 | Cafe/Sweets | Shizuoka-ken Hamamatsu-shi Motoshirochō | 34.710895 | 137.725940 | 3 | 2 days 05:00:00 |
| 3 | air_ee3a01f0c71a769f | 2016-01-09 | 57 | Saturday | 0 | Cafe/Sweets | Shizuoka-ken Hamamatsu-shi Motoshirochō | 34.710895 | 137.725940 | 25 | 34 days 13:00:00 |
| 4 | air_ee3a01f0c71a769f | 2016-01-10 | 32 | Sunday | 0 | Cafe/Sweets | Shizuoka-ken Hamamatsu-shi Motoshirochō | 34.710895 | 137.725940 | 5 | 4 days 09:00:00 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 758042 | hpg_ffe96ca8c6ec10cf | 2017-03-31 | 8 | Friday | 0 | Italian | Tōkyō-to Chūō-ku None | 35.682504 | 139.773619 | 8 | 29 days 03:00:00 |
| 758043 | hpg_ffe96ca8c6ec10cf | 2017-04-05 | 4 | Wednesday | 0 | Italian | Tōkyō-to Chūō-ku None | 35.682504 | 139.773619 | 4 | 13 days 10:00:00 |
| 758044 | hpg_ffe96ca8c6ec10cf | 2017-04-07 | 11 | Friday | 0 | Italian | Tōkyō-to Chūō-ku None | 35.682504 | 139.773619 | 11 | 6 days 21:00:00 |
| 758045 | hpg_ffe96ca8c6ec10cf | 2017-04-15 | 3 | Saturday | 0 | Italian | Tōkyō-to Chūō-ku None | 35.682504 | 139.773619 | 3 | 40 days 06:00:00 |
| 758046 | hpg_ffe96ca8c6ec10cf | 2017-04-24 | 16 | Monday | 0 | Italian | Tōkyō-to Chūō-ku None | 35.682504 | 139.773619 | 16 | 5 days 22:00:00 |
758047 rows × 11 columns
In [ ]:
def samples(df):
ids = random.choices(data.where(data["store_id"].str[:3] == "air")["store_id"].unique(), k=3)
for id in ids:
temp = df.where(df["store_id"] == id).dropna().reset_index(drop=True)
temp.sort_values("date")
fig = go.Figure()
fig.add_trace(go.Scatter(x=temp["date"], y=temp["visitors"],
mode='lines',
name='visitors'))
fig.add_trace(go.Scatter(x=temp["date"], y=temp["reserve_visitors"],
mode='lines',
name='reserve_visitors'))
fig.update_layout(title=id,
xaxis_title='date')
fig.show()
print(id)
In [ ]:
samples(data)
air_63a750d8b4b6a976
air_d07e57b21109304a
air_3e93f3c81008696d
In [ ]:
temp = deepcopy(data)
temp["month"] = [date.month_name() for date in temp["date"]]
temp = temp.drop(columns=["store_id", "date", "holiday_flg", "genre_name", "area_name", "latitude", "longitude", "day_of_week"])
temp = temp.groupby('month').mean().reset_index().sort_values("visitors")
fig = px.bar(temp, x='month', y='visitors',
color='visitors', height=400)
fig.show()
In [ ]:
fig = px.bar(data.groupby("holiday_flg")["visitors"].mean().reset_index().sort_values("visitors"), x='holiday_flg', y='visitors',
color='visitors', height=400)
fig.show()
In [ ]:
def genre_area_stores(df):
temp = deepcopy(df)
temp = temp.groupby('genre_name').count().reset_index()
temp = temp.drop(columns=["store_id", "latitude", "longitude"])
temp = temp.rename(columns={"area_name": "stores"}).sort_values("stores")
fig = px.bar(temp.tail(50), x='genre_name', y='stores',
color='stores', height=400)
fig.show()
temp = deepcopy(df)
temp = temp.groupby('area_name').count().reset_index()
temp = temp.drop(columns=["store_id", "latitude", "longitude"])
temp = temp.rename(columns={"genre_name": "stores"}).sort_values("stores")
fig = px.bar(temp.tail(50), x='area_name', y='stores',
color='stores', height=600)
fig.show()
In [ ]:
genre_area_stores(air_store_info)
In [ ]:
genre_area_stores(hpg_store_info)
In [ ]:
genre_area_stores(data)
In [ ]:
def genre_area_visitors(df):
temp = deepcopy(df.drop(columns=["store_id", "latitude", "longitude", "date", "day_of_week", "holiday_flg", "delta", "area_name", "reserve_visitors"]))
temp = temp.groupby('genre_name').sum().reset_index().sort_values("visitors")
fig = px.bar(temp.tail(50), x='genre_name', y='visitors',
color='visitors', height=400)
fig.show()
temp = deepcopy(df.drop(columns=["store_id", "latitude", "longitude", "date", "day_of_week", "holiday_flg", "delta", "genre_name", "reserve_visitors"]))
temp = temp.groupby('area_name').sum().reset_index().sort_values("visitors")
fig = px.bar(temp.tail(50), x='area_name', y='visitors',
color='visitors', height=600)
fig.show()
In [ ]:
genre_area_visitors(data)
In [ ]:
# from shapely.geometry import Point
# import geopandas
# longitudes = data["longitude"].tolist()
# latitudes = data["latitude"].tolist()
# world = geopandas.read_file(geopandas.datasets.get_path("naturalearth_lowres"))
# japan = world[world.name == 'Japan']
# fig,ax = plt.subplots(figsize = (10,12))
# japan.plot(ax = ax)
# geometry = [Point(xy) for xy in zip(longitudes,latitudes)]
# geo_df = geopandas.GeoDataFrame(geometry = geometry)
# g = geo_df.plot(ax = ax, markersize = 20, color = 'red',marker = '*',label = 'Delhi')
# plt.show()
In [ ]:
profile = ProfileReport(data, title="Profiling Report")
profile
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]
Out[ ]:
The Kernel crashed while executing code in the current cell or a previous cell. Please review the code in the cell(s) to identify a possible cause of the failure. Click <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. View Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details.